Load data

# Daily trip counts, one row per date
trips_per_day <- read_tsv("trips_per_day.tsv")

# Holiday file has no header row, so name the columns explicitly.
# TRUE/FALSE spelled out: T and F are ordinary variables and can be reassigned.
holiday <- read.csv("holiday.csv", header = FALSE,
                    col.names = c("day_num", "ymd", "holiday_name"))

# convert ymd to a Date type in holiday so it can be joined on trips_per_day$ymd
holiday$ymd <- as.Date(holiday$ymd)

Adding new predictors

# join holiday info to trips_per_day and add is_holiday_column
# Flag each date as a holiday or not: after the left join, a non-missing
# holiday_name means the date appeared in the holiday table. The joined
# lookup columns are dropped once the flag is computed.
trips_per_day <- trips_per_day %>%
  left_join(holiday, by = "ymd") %>%
  mutate(is_holiday = !is.na(holiday_name)) %>%
  select(-day_num, -holiday_name)

# Add day of week as another predictor. `ymd` is already a Date, so call
# weekdays() on it directly: converting to POSIXct first places the date at
# UTC midnight, and formatting that in a local timezone west of UTC can
# shift the result to the previous day.
trips_per_day <- trips_per_day %>%
  mutate(weekday = weekdays(ymd, abbreviate = TRUE))

# Add flu season (December through February) as another predictor.
# The month is extracted inline, so no temporary column needs to be
# created and dropped afterwards.
flu_season_months <- c("12", "01", "02")
trips_per_day <- trips_per_day %>%
  mutate(is_flu_season = format(ymd, "%m") %in% flu_season_months)

Split the data into different sets

set.seed(42)
# Split 80/20 into training vs. holdout, then split the holdout 50/50 into
# validation and test sets (caTools::sample.split stratifies on the outcome).
# `in_train` avoids masking base::sample(); the logical mask is used directly
# instead of comparing against TRUE/FALSE.
in_train <- sample.split(trips_per_day$num_trips, SplitRatio = 0.8)
train_data <- subset(trips_per_day, in_train)
other_data <- subset(trips_per_day, !in_train)
in_validate <- sample.split(other_data$num_trips, SplitRatio = 0.5)
validate_data <- subset(other_data, in_validate)
test_data <- subset(other_data, !in_validate)

# Sanity check: the three splits together partition the full data set
nrow(trips_per_day) == nrow(train_data) + nrow(test_data) + nrow(validate_data)
## [1] TRUE

Modeling with only minimum temperature as a predictor

# Fit polynomials of degree 1..8 in tmin and record train/validation RMSE.
# Error vectors are preallocated instead of grown inside the loop.
K <- 1:8
train_err <- numeric(length(K))
validate_err <- numeric(length(K))
for (k in K) {
  # fit on the training data
  model <- lm(num_trips ~ poly(tmin, k, raw = TRUE), train_data)

  # RMSE -> root mean square error: sqrt(mean((prediction - truth)^2))
  train_err[k] <- sqrt(mean((predict(model, train_data) - train_data$num_trips)^2))
  validate_err[k] <- sqrt(mean((predict(model, validate_data) - validate_data$num_trips)^2))
}

# Reshape to long format for plotting; pivot_longer() supersedes gather()
plot_data <- data.frame(K, train_err, validate_err) %>%
  pivot_longer(-K, names_to = "split", values_to = "error")

# Train vs. validation RMSE by polynomial degree
ggplot(plot_data, aes(x = K, y = error, color = split)) +
  geom_line() +
  scale_x_continuous(breaks = K) +
  xlab('Polynomial Degrees') +
  ylab('RMSE')

Cross-Validation with all the predictors

# fit a model for each polynomial degree consider 7 predictors
# Fit a model for each polynomial degree using all 7 predictors.
# Error vectors are preallocated instead of grown inside the loop.
K <- 1:8
train_err <- numeric(length(K))
validate_err <- numeric(length(K))
for (k in K) {
  # fit on the training data: degree-k polynomials in the four weather
  # variables plus the three categorical predictors
  model <- lm(num_trips ~ poly(prcp, k, raw = TRUE) + poly(snwd, k, raw = TRUE) +
                poly(tmax, k, raw = TRUE) + poly(tmin, k, raw = TRUE) +
                weekday + is_holiday + is_flu_season, train_data)

  # RMSE -> root mean square error: sqrt(mean((prediction - truth)^2))
  train_err[k] <- sqrt(mean((predict(model, train_data) - train_data$num_trips)^2))
  validate_err[k] <- sqrt(mean((predict(model, validate_data) - validate_data$num_trips)^2))
}

# Reshape to long format for plotting; pivot_longer() supersedes gather()
plot_data <- data.frame(K, train_err, validate_err) %>%
  pivot_longer(-K, names_to = "split", values_to = "error")

# Train vs. validation RMSE by polynomial degree
ggplot(plot_data, aes(x = K, y = error, color = split)) +
  geom_line() +
  scale_x_continuous(breaks = K) +
  xlab('Polynomial Degrees') +
  ylab('RMSE')

A polynomial of degree two gives the lowest validation error. Further investigation is still needed.

Further refine the model at polynomial degree two

# Everything at degree 2: quadratic terms for all four weather variables
model <- lm(num_trips ~ poly(prcp, 2, raw = TRUE) + poly(snwd, 2, raw = TRUE) +
              poly(tmax, 2, raw = TRUE) + poly(tmin, 2, raw = TRUE) +
              weekday + is_holiday + is_flu_season, train_data)
glance(model)

# take out insignificant predictors: drop the quadratic term on snwd
model <- lm(num_trips ~ poly(prcp, 2, raw = TRUE) + snwd +
              poly(tmax, 2, raw = TRUE) + poly(tmin, 2, raw = TRUE) +
              weekday + is_holiday + is_flu_season, train_data)

# check R^2 on the train and validation sets
rsquare(model, train_data)
## [1] 0.8974348
rsquare(model, validate_data)
## [1] 0.8960873

# take out more insignificant predictors: tmax and tmin become linear
model <- lm(num_trips ~ poly(prcp, 2, raw = TRUE) + snwd + tmax + tmin +
              weekday + is_holiday + is_flu_season, train_data)

# Check RMSE and R^2
rmse(model, train_data)
## [1] 3199.978
rmse(model, validate_data)
## [1] 3373.947
rsquare(model, train_data)
## [1] 0.8964213
rsquare(model, validate_data)
## [1] 0.8944062

Best model

# Best Model: quadratic precipitation, all other weather terms linear
model <- lm(num_trips ~ poly(prcp, 2, raw = TRUE) + snwd + tmax + tmin +
              weekday + is_holiday + is_flu_season, train_data)
rsquare(model, train_data)
## [1] 0.8964213
tidy(model)

# test the model by combining train_data with validate_data
com_data <- rbind(train_data, validate_data)
rsquare(model, com_data)
## [1] 0.8967198

# attach fitted values to the validation set for plotting
plot_data <- validate_data %>%
  add_predictions(model)

# use ggplotly for interactive plot: predictions (red) over actuals (black)
ggplotly(
  ggplot(plot_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    xlab("Date") +
    ylab("Predicted (in red)/ Actual (in black)") +
    ggtitle("Number of trips at different dates")
)

# predicted vs. actual; the dashed identity line marks perfect prediction
ggplot(plot_data, aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  xlab('Predicted') +
  ylab('Actual') +
  ggtitle("Predicted Value against Actual Value ")

Test the model on test set

# evaluate the chosen model on the held-out test set
rmse(model, test_data)
## [1] 13089.71
# negative R^2 !!
rsquare(model, test_data)
## [1] -0.2801334

# add predictions to test data
plot_test_data <- test_data %>%
  add_predictions(model)

# plot ymd versus prediction and actual value
ggplotly(
  ggplot(plot_test_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    xlab("Date") +
    ylab("Predicted (in red)/ Actual (in black)") +
    ggtitle("Number of trips at different dates")
)

# removed the outlier; compare Date against Date explicitly rather than
# relying on implicit string-to-Date coercion
plot_data <- test_data %>%
  add_predictions(model) %>%
  filter(ymd != as.Date("2014-04-30"))

# Small error: R^2 = 0.95 once the outlier is excluded
rmse(model, plot_data)
## [1] 2510.775
rsquare(model, plot_data)
## [1] 0.9504549
ggplotly(
  ggplot(plot_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    xlab("Date") +
    ylab("Predicted (in red)/ Actual (in black)") +
    ggtitle("Number of trips at different dates")
)

I’m expecting my model to produce an R^2 around 0.89 on a new data set, with an RMSE around 3000.